The first two sections are for reproducibility purposes. The detail of the result is shown in section 3 (Quality Control).
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(RColorBrewer)
library(tidyverse)
library(VennDiagram)
library(emoji)
library(ggVennDiagram)
})suppressMessages({
source(here("UPSCb-common/Rtoolbox/src/plotEnrichedTreemap.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
source(here("UPSCb-common/src/R/topGoUtilities.R"))
})"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
#message(paste("Plotting",gene_id))
sel <- grepl(gene_id,rownames(vst))
stopifnot(sum(sel)==1)
p <- ggplot(bind_cols(as.data.frame(colData(dds)),
data.frame(value=vst[sel,])),
aes(x=Genotype,y=value,fill=Genotype)) +
geom_boxplot() +
geom_jitter(color="black") +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id))
suppressMessages(suppressWarnings(plot(p)))
return(NULL)
}"extract_results" <- function(dds,vst,contrast,
padj=0.01,lfc=0.5,
plot=TRUE,verbose=TRUE,
export=TRUE,default_dir=here("data/analysis/DE"),
default_prefix="DE-",
labels=colnames(dds),
sample_sel=1:ncol(dds),
expression_cutoff=0,
debug=FALSE,filter=c("median",NULL),...){
# get the filter
if(!is.null(match.arg(filter))){
filter <- rowMedians(counts(dds,normalized=TRUE))
message("Using the median normalized counts as default, set filter=NULL to revert to using the mean")
}
# validation
if(length(contrast)==1){
res <- results(dds,name=contrast,filter = filter)
} else {
res <- results(dds,contrast=contrast,filter = filter)
}
stopifnot(length(sample_sel)==ncol(vst))
if(plot){
par(mar=c(5,5,5,5))
volcanoPlot(res)
par(mar=mar)
}
# a look at independent filtering
if(plot){
plot(metadata(res)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
}
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
round(max.theta*100,digits=5),
round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
}
if(plot){
qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
qtl.exp=qtl.exp,
number.degs=sapply(lapply(qtl.exp,function(qe){
res$padj <= padj & abs(res$log2FoldChange) >= lfc &
! is.na(res$padj) & res$baseMean >= qe
}),sum))
if(debug){
plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red"))
p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_log10("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
geom_line() + geom_point() +
geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes"))
plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Cumulative number of DE genes"))
plot(ggplot(data.frame(x=dat$thetas[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes per interval"))
plot(ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("base mean of expression") +
scale_y_continuous("Number of DE genes per interval"))
p <- ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_log10("base mean of expression") +
scale_y_continuous("Number of DE genes per interval") +
geom_vline(xintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
}
}
sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) &
res$baseMean >= expression_cutoff
if(verbose){
message(sprintf(paste(
ifelse(sum(sel)==1,
"There is %s gene that is DE",
"There are %s genes that are DE"),
"with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s"),
sum(sel),padj,
lfc,expression_cutoff))
}
# proceed only if there are DE genes
if(sum(sel) > 0){
val <- rowSums(vst[sel,sample_sel,drop=FALSE])==0
if (sum(val) >0){
warning(sprintf(paste(
ifelse(sum(val)==1,
"There is %s DE gene that has",
"There are %s DE genes that have"),
"no vst expression in the selected samples"),sum(val)))
sel[sel][val] <- FALSE
}
if(export){
if(!dir.exists(default_dir)){
dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
}
write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
}
if(plot & sum(sel)>1){
heatmap.2(t(scale(t(vst[sel,sample_sel]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method="ward.D2")},
trace="none",col=hpal,labRow = FALSE,
labCol=labels[sample_sel],...
)
}
}
return(list(all=rownames(res[sel,]),
up=rownames(res[sel & res$log2FoldChange > 0,]),
dn=rownames(res[sel & res$log2FoldChange < 0,])))
}extractEnrichmentResults <- function(enrichment,
diff.exp=c("all","up","dn"),
go.namespace=c("BP","CC","MF"),
genes=NULL,export=FALSE,plot=TRUE,
default_dir=here("analysis/DE"),
default_prefix="DE"){
# process args
diff.exp <- match.arg(diff.exp)
de <- ifelse(diff.exp=="all","none",
ifelse(diff.exp=="dn","down",diff.exp))
# sanity
if(is.null(unlist(enrichment)) | length(unlist(enrichment)) == 0){
message("No GO enrichment for",names(enrichment))
} else {
# write out
if(export){
write_tsv(bind_rows(enrichment),
file=here(default_dir,
paste0(default_prefix,"-genes_GO-enrichment.tsv")))
if(!is.null(genes)){
write_tsv(
enrichedTermToGenes(genes=genes,terms=enrichment$id,url=url,mc.cores=16L),
file=here(default_dir,
paste0(default_prefix,"-enriched-term-to-genes.tsv"))
)
}
}
if(plot){
gocatname <- c(BP="Biological Process",
CC="Cellular Component",
MF="Molecular Function")
degname <- c(all="all DEGs",
up="up-regulated DEGs",
dn="down-regulated DEGs")
lapply(names(enrichment),function(n){
lapply(names(enrichment[[n]]),function(de){
lapply(names(enrichment[[n]][[de]]),function(gocat){
dat <- enrichment[[n]][[de]][[gocat]]
if(is.null(dat)){
message("No GO enrichment for ",degname[de]," in category ",gocatname[gocat])
} else {
dat$GeneRatio <- dat$Significant/dat$Annotated
dat$Pvalue <- as.numeric(dat$FDR)
dat$Count <- as.numeric(dat$Significant)
dat <- dat[order(dat$GeneRatio),]
dat$Term <- factor(dat$Term, levels = unique(dat$Term))
ggplot(dat, aes(x =Term, y = GeneRatio, color = Pvalue, size = Count)) +
geom_point() +
scale_color_gradient(low = "red", high = "blue") +
theme_bw() +
ylab("GeneRatio") +
xlab("") +
ggtitle(paste0("GO enrichment: ",degname[de]," ",gocatname[gocat])) +
coord_flip()
}
})
})
})
}
}
}load(here("analysis/salmon/dds_mergeTechRep.rda"))
dds$Genotype <- relevel(dds$Genotype,ref = "T89")Normalisation for visualisation
## using 'avgTxLength' from assays(dds), correcting for library size
The targeted locus for line 26 is
Potri.006G174000/Potra001877g14982 (Potra2n6c13821) And for
the line 03, it is Potri.018G096200/Potra001273g10998
(Potra2n6c13821)
kogene <- "Potra2n6c13821"
stopifnot(kogene %in% rownames(vst))
dev.null <- line_plot(dds=dds,vst=vst,gene_id = kogene)Even though both knockout lines have lower expression of Potra2n6c13821 than parental T89, line26 has lower expression of Potra2n6c13821 than in line3.
exp <- bind_cols(as.data.frame(colData(dds)),data.frame(value=vst[kogene,]))
t.test(exp$value[exp$Genotype == "line26"], exp$value[exp$Genotype == "T89"])##
## Welch Two Sample t-test
##
## data: exp$value[exp$Genotype == "line26"] and exp$value[exp$Genotype == "T89"]
## t = -8.3332, df = 4.1565, p-value = 0.0009577
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.3056495 -0.6602609
## sample estimates:
## mean of x mean of y
## 2.728281 3.711236
##
## Welch Two Sample t-test
##
## data: exp$value[exp$Genotype == "line3"] and exp$value[exp$Genotype == "T89"]
## t = -4.011, df = 5.7834, p-value = 0.007582
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4919329 -0.1170351
## sample estimates:
## mean of x mean of y
## 3.406752 3.711236
P-values for the hypothesis that the mean expression level of knocked-out lines are not equal to the parent are <0.001 for line26 and <0.01 for line3.
👉 Please note that, even though the expression levels are significantly lower, there are still read counts on the gene in knockout lines.
| Name | V2 |
|---|---|
| SNRPG | Potra2n6c13472 |
| SNRPB/N | Potra2n11c22477 |
| SNRPD1 | Potra2n5c10994 |
| SNRPD2 | Potra2n14c27408 |
| SNRPD3 | Potra2n2c6364 |
| SNRPF | Potra2n8c17320 |
| SNRPE | Potra2n6c13821 |
| LSM1A-B | Potra2n1c1466 |
| LSM2 | Potra2n4c10338 |
| LSM3A-B | Potra2n5c11140 |
| LSM4 | Potra2n2c4537 |
| LSM6A-B | Potra2n8c17320 |
| LSM7 | Potra2n1c2324 |
| LSM8 | Potra2n1c2419 |
genelist <- goi$V2[!(goi$V2 == "Potra2n6c13821")]
stopifnot(all(genelist %in% rownames(vst)))
dev.null <- lapply(genelist,line_plot,dds=dds,vst=vst)The reads mapped on all SM and LSM are selected and used for alignment on T89 haplotypes fasta. We do not have annotations for the haplotypes but from sequence similarity to Potra2n6c13821 we can guess where is the gene.
T89 Haplotype 1 (h1tg000012l)
T89 Haplotype 2 (h2tg000021l)
👉 The structure of the target gene in knockout lines is still similar to T89 parent on both haplotypes.
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Check the different contrasts
## [1] "Intercept" "Genotype_line26_vs_T89" "Genotype_line3_vs_T89"
line26 <- extract_results(dds=dds,vst=vst,contrast="Genotype_line26_vs_T89",
default_dir = here("analysis/DE"),
default_prefix = "Line26_", export=FALSE,
labels=dds$BioID,
sample_sel = dds$Genotype %in% c("line26","T89"),
cexCol=.9, plot = TRUE, verbose = TRUE)## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## Loading required package: LSD
## The independent filtering cutoff is 0.14916, removing 28.82535% of the data
## The independent filtering maximises for 28.82535 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8765 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
line03 <- extract_results(dds=dds,vst=vst,contrast="Genotype_line3_vs_T89",
default_dir = here("analysis/DE"),
default_prefix = "Line03_", export=FALSE,
labels=dds$BioID,
sample_sel = dds$Genotype %in% c("line3","T89"),
cexCol=.9, plot = TRUE, verbose = TRUE)## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.14916, removing 28.82535% of the data
## The independent filtering maximises for 28.82535 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8064 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
ggVennDiagram(lapply(res.list,"[[","all"), label_alpha = 0, label = "count") +
scale_fill_gradient(low="white",high = "dodgerblue") +
scale_x_continuous(expand = expansion(mult = .1))Here we used topGO to calculate GO enrichment of DEGs vs all genes. Any term having P-value less than 0.05 is shown. We didn’t calculate adjusted P-values because they did not give signicant enrichment. Please interpret the output with caution.
background <- rownames(vst)[featureSelect(vst,dds$Genotype,exp=0.00001)]
goannot <- prepAnnot(mapping = "/mnt/picea/storage/reference/Populus-tremula/v2.2/gopher/gene_to_go.tsv")res.list <- list("Line 26"=line26)
suppressMessages(enr.list <- lapply(res.list,function(r){
lapply(r,topGO,background=background,annotation=goannot,alpha=0.05,p.adjust="none")
}))
suppressWarnings(extractEnrichmentResults(enr.list))## [[1]]
## [[1]][[1]]
## [[1]][[1]][[1]]
##
## [[1]][[1]][[2]]
##
## [[1]][[1]][[3]]
##
##
## [[1]][[2]]
## [[1]][[2]][[1]]
##
## [[1]][[2]][[2]]
##
## [[1]][[2]][[3]]
##
##
## [[1]][[3]]
## [[1]][[3]][[1]]
##
## [[1]][[3]][[2]]
##
## [[1]][[3]][[3]]
res.list <- list("Line 03"=line03)
suppressMessages(enr.list <- lapply(res.list,function(r){
lapply(r,topGO,background=background,annotation=goannot,alpha=0.05,p.adjust="none")
}))
suppressWarnings(extractEnrichmentResults(enr.list))## [[1]]
## [[1]][[1]]
## [[1]][[1]][[1]]
##
## [[1]][[1]][[2]]
##
## [[1]][[1]][[3]]
##
##
## [[1]][[2]]
## [[1]][[2]][[1]]
##
## [[1]][[2]][[2]]
##
## [[1]][[2]][[3]]
##
##
## [[1]][[3]]
## [[1]][[3]][[1]]
##
## [[1]][[3]][[2]]
##
## [[1]][[3]][[3]]
res.list <- list("CommonLine26andLine03"=list("all"=intersect(line26$all,line03$all),
"up"=intersect(line26$up,line03$up),
"dn"=intersect(line26$dn,line03$dn)))
suppressMessages(enr.list <- lapply(res.list,function(r){
lapply(r,topGO,background=background,annotation=goannot,alpha=0.05,p.adjust="none")
}))
suppressWarnings(extractEnrichmentResults(enr.list))## [[1]]
## [[1]][[1]]
## [[1]][[1]][[1]]
##
## [[1]][[1]][[2]]
##
## [[1]][[1]][[3]]
##
##
## [[1]][[2]]
## [[1]][[2]][[1]]
##
## [[1]][[2]][[2]]
##
## [[1]][[2]][[3]]
##
##
## [[1]][[3]]
## [[1]][[3]][[1]]
##
## [[1]][[3]][[2]]
##
## [[1]][[3]][[3]]
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] LSD_4.1-0 topGO_2.54.0
## [3] SparseM_1.81 GO.db_3.18.0
## [5] AnnotationDbi_1.64.1 graph_1.80.0
## [7] limma_3.58.1 ggVennDiagram_1.4.9
## [9] emoji_15.0 VennDiagram_1.7.3
## [11] futile.logger_1.4.3 lubridate_1.9.3
## [13] forcats_1.0.0 stringr_1.5.1
## [15] dplyr_1.1.4 purrr_1.0.2
## [17] readr_2.1.4 tidyr_1.3.0
## [19] tibble_3.2.1 tidyverse_2.0.0
## [21] RColorBrewer_1.1-3 hyperSpec_0.100.0
## [23] xml2_1.3.6 ggplot2_3.4.4
## [25] lattice_0.21-8 here_1.0.1
## [27] gplots_3.1.3 DESeq2_1.42.0
## [29] SummarizedExperiment_1.32.0 Biobase_2.62.0
## [31] MatrixGenerics_1.14.0 matrixStats_1.2.0
## [33] GenomicRanges_1.54.1 GenomeInfoDb_1.38.5
## [35] IRanges_2.36.0 S4Vectors_0.40.2
## [37] BiocGenerics_0.48.1 data.table_1.14.10
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.0 bitops_1.0-7 deldir_2.0-2
## [4] formatR_1.14 testthat_3.2.1 rlang_1.1.2
## [7] magrittr_2.0.3 RSQLite_2.3.4 compiler_4.3.1
## [10] png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3
## [13] crayon_1.5.2 fastmap_1.1.1 XVector_0.42.0
## [16] labeling_0.4.3 caTools_1.18.2 utf8_1.2.4
## [19] rmarkdown_2.25 tzdb_0.4.0 bit_4.0.5
## [22] xfun_0.41 zlibbioc_1.48.0 cachem_1.0.8
## [25] jsonlite_1.8.8 blob_1.2.4 highr_0.10
## [28] DelayedArray_0.28.0 BiocParallel_1.36.0 jpeg_0.1-10
## [31] parallel_4.3.1 R6_2.5.1 bslib_0.6.1
## [34] stringi_1.8.3 brio_1.1.4 jquerylib_0.1.4
## [37] Rcpp_1.0.11 knitr_1.45 Matrix_1.6-0
## [40] timechange_0.2.0 tidyselect_1.2.0 rstudioapi_0.15.0
## [43] abind_1.4-5 yaml_2.3.8 codetools_0.2-19
## [46] KEGGREST_1.42.0 withr_2.5.2 evaluate_0.23
## [49] lambda.r_1.2.4 Biostrings_2.70.1 pillar_1.9.0
## [52] KernSmooth_2.23-22 generics_0.1.3 vroom_1.6.4
## [55] rprojroot_2.0.4 RCurl_1.98-1.13 hms_1.1.3
## [58] munsell_0.5.0 scales_1.3.0 gtools_3.9.5
## [61] glue_1.6.2 lazyeval_0.2.2 tools_4.3.1
## [64] interp_1.1-5 locfit_1.5-9.8 latticeExtra_0.6-30
## [67] colorspace_2.1-0 GenomeInfoDbData_1.2.11 cli_3.6.2
## [70] futile.options_1.0.1 fansi_1.0.6 S4Arrays_1.2.0
## [73] gtable_0.3.4 sass_0.4.8 digest_0.6.33
## [76] SparseArray_1.2.2 farver_2.1.1 memoise_2.0.1
## [79] htmltools_0.5.7 lifecycle_1.0.4 httr_1.4.7
## [82] statmod_1.5.0 bit64_4.0.5
Created by Fai Theerarat Kochakarn